/****************************************************************************************
***
***  Program: ffsa analysis 072018.sas
***  Purpose: June 2018 FFSA analysis with average number of claims per HCC factored
***           into the calculations.
***
***           This code is a modification of Z:\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\
***                FFSA Calibration\SAS Code\ffsa analysis 030718.sas
***
***
***
***
***	 Datasets: 50 permanent datasets of identical structure are saved within the bootstrap macro.  
***			   These datasets have the naming convention rs_compare_# where their suffix
***			   identifies the iteration of the bootstrap. Additionally, 50 pdf documents
***			   are produced which contain values in the rs_compare_# datasets.
***
***
****************************************************************************************/
 
/* Session setup.
   MPRINT echoes the SAS statements generated by macro execution to the log.
   Librefs: DAT1 supplies the input tables read below and DATOUT receives the
   permanent rs_compare_# tables. DAT is assigned here but is not referenced by
   any later statement in this program.
   OUTFILE2 is the PDF that receives the pre-perturbment coefficient listing. */
OPTIONS MPRINT;
TITLE1 "FFSA Analysis June 2018";

LIBNAME dat "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration\June 2018 Analysis\Input Data";
LIBNAME dat1 "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration";
LIBNAME datout "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration\June 2018 Analysis\Output";

FILENAME outfile2 "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration\June 2018 Analysis\Output\pre_pert_coeffs_072618.pdf";


/* Input the 2 data files:

   POPHCC: This is the original FFS data (1.4 million records). It has the original
           expenditures and HCC information. In the June 2018 analysis, this includes
           the number of claims per HCC for each enrollee.
   MASTERMA: This is the random sample of 2 million MA enrollees.  */ 

	/* Working copy of the FFS population table. */
	DATA pophcc;
	   SET dat1.y5r1s15f;
	RUN;

	PROC CONTENTS DATA=pophcc;
	RUN;

	/* Stack the two MA sample tables into MASTERMA.
	   - The expenditure fields are set to missing for every MA record, so these
	     records cannot contribute to a regression fit on APY05COMMABAD.
	   - MA_ENROLLEE_NUMBER is the row counter of this step and is used later as
	     the merge key for the pre/post risk score comparison.
	   - The RENAME= option is applied as the table is written, so the KEEP
	     statement lists the incoming (pre-rename) names. */
	DATA masterma(RENAME=(originally_disabled_female_aged = originallydisabled_female
	                      originally_disabled_male_aged   = originallydisabled_male
	                      dm_cvd                          = dm_cvd_70hccs
	                      copd_cvd_cad                    = copd_cvd_cad_70hccs));
	   SET dat1.samptb_y13_full1m
	       dat1.samptb_y13_elig1m;

	   /* Shorten the stored length of the demographic and HCC variables. */
	   LENGTH f0_34 -- f95_gt m0_34 -- m95_gt 4
	          hcc1 -- hcc177 3;

	   apy05commabad      = .;
	   newexpenditures    = .;
	   ma_enrollee_number = _n_;

	   KEEP apy05commabad newexpenditures
	        f0_34 -- f95_gt m0_34 -- m95_gt
	        hcc1 -- hcc177
	        originally_disabled_female_aged
	        medicaid_female_aged medicaid_female_disabled
	        originally_disabled_male_aged
	        medicaid_male_Aged medicaid_male_disabled
	        d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107
	        dm_chf1 dm_cvd chf_copd copd_cvd_cad rf_chf1 rf_chf_dm
	        ma_enrollee_number;
	RUN;

	/* Inspect the structure and a few rows of both tables. */

	PROC CONTENTS DATA=pophcc;
	RUN;

	PROC CONTENTS DATA=masterma;
	RUN;

	PROC PRINT DATA=pophcc (OBS=1);
	   TITLE2 "1 Observation from POPHCC";
	RUN;

	PROC PRINT DATA=masterma (OBS=10);
	   TITLE2 "10 Observations from MASTERMA";
	RUN;

	/* Confirm that APY05COMMABAD is missing for all MA enrollees. */

	PROC FREQ DATA=masterma;
	   TABLES apy05commabad / MISSING;
	   TITLE2 "Frequency Distribution of APY05COMMABAD for MA Enrollees";
	RUN;


/***********/
/* Step 1
   
   Regress expenses on the original FFS data (unadjusted expenditures and unRADVed HCCs).
   Calculate risk factors for the model (estimated coefficients divided by average expenditures)
   and risk scores for each beneficiary.

   In this step, we predict for the MA enrollees using the unadjusted risk factors from the regression on 
   unadjusted FFS HCCs. The predicted risk scores for MA enrollees are used in Step 5 below. Note
   the regression does not use the MA enrollee data in the fit, since apy05commabad is missing
   for all MA enrollees. */

	/* Mean observed expenditure in the unperturbed FFS table. */
	PROC MEANS DATA=pophcc;
	   VAR apy05commabad;
	   TITLE2 "Average Expenditures for Original FFS Data";
	RUN;

	/* Append the MA enrollees so PROC REG also scores them. Their response is
	   missing, so they receive predictions without entering the fit. */
	DATA pophcc_ma;
	   SET pophcc
	       masterma;
	RUN;

	/* No-intercept regression of expenditures on demographics, HCCs and interactions. */
	PROC REG DATA=pophcc_ma
	         OUTEST=estcoeffs1(DROP=_type_ _model_ _rmse_ _depvar_ apy05commabad);
	   MODEL apy05commabad = f0_34 -- f95_gt m0_34 -- m95_gt      /* demographics */
	                         hcc1 -- hcc177                       /* HCCs         */
	                         d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107
	                         dm_chf1 dm_cvd_70hccs chf_copd
	                         copd_cvd_cad_70hccs rf_chf1 rf_chf_dm /* interactions */
	                         / NOINT;
	   OUTPUT OUT=predictions1 (KEEP=hicno ma_enrollee_number apy05commabad predicted)
	          P=predicted;
	   TITLE2 "Regression on Original FFS Data";
	RUN;

	/* Means of observed and predicted expenditures over rows with a non-missing
	   response, which leaves the MA enrollees out of the calculation. */
	PROC MEANS DATA=predictions1(WHERE=(apy05commabad ne .)) NOPRINT;
	   VAR apy05commabad predicted;
	   OUTPUT OUT=mean_predictions1(DROP=_type_ _freq_)
	          MEAN=mean_expense mean_predicted;
	   TITLE2 "Original FFS Data, Average Expenses, and Predicted Expenses";
	RUN;

	/* Risk score = predicted expenditure / mean observed expenditure.
	   The one-row means table is read once and its value is carried to every row. */
	DATA predictions1;
	   SET predictions1;
	   IF _n_ = 1 THEN SET mean_predictions1(KEEP=mean_expense);
	   risk_score = predicted / mean_expense;
	RUN;

	/* Check: the mean risk score over rows with a non-missing response should be 1. */

	PROC MEANS DATA=predictions1(WHERE=(apy05commabad ne .));
	   VAR risk_score;
	   TITLE2 "Average Risk Score";
	   TITLE3 "Verify that Average Risk Score = 1";
	RUN;

	/* Attach the mean predicted expenditure to the coefficient row. */
	DATA estcoeffs1;
	   SET estcoeffs1;
	   IF _n_ = 1 THEN SET mean_predictions1(DROP=mean_expense);
	RUN;

	/* Write the pre-perturbment coefficient listing to the OUTFILE2 PDF. */
	ODS PDF BODY=outfile2;

	   PROC PRINT DATA=estcoeffs1;
	      TITLE2 "Estimated Regression Coefficients for Original FFS Data Regression";
	      TITLE3 "Pre-Perturbment";
	   RUN;

	ODS PDF CLOSE;


/*----------------------------------------------------------------------------------------
   H: emits DATA step statements that randomly switch off one HCC flag.

   Parameters
      hcc  numeric suffix of the flag variable (the flag is hcc<suffix>)
      cat  category code: H, N or L; selects the base rate .461591, .337938 or .209114
      n    exponent applied to the base rate

   Generated logic: when the flag equals 1, draw r from Uniform(0,1), store the category
   code in CATEGORY, and if r < rate**n set the flag to 0 and PROFILE_CHANGE to 1.
   For any other category code the draw and the CATEGORY assignment are still emitted
   but the flag is never changed.

   The macro is compiled once here instead of being re-defined inside every replicate.
----------------------------------------------------------------------------------------*/
%macro h(hcc,cat,n);
   %local rate;
   %if "&cat." = "H" %then %let rate = .461591;
   %else %if "&cat." = "N" %then %let rate = .337938;
   %else %if "&cat." = "L" %then %let rate = .209114;
   %else %let rate = ;

            if hcc&hcc.=1 then do;
               r=rand("unif");
               category="&cat.";
   %if %length(&rate.) > 0 %then %do;
               if r<&rate.**&n. then do; profile_change=1; hcc&hcc.=0; end;
   %end;
            end;
%mend h;

/*----------------------------------------------------------------------------------------
   NH: same parameters and same generated code as H. The separate name is kept so the
   call lists below still distinguish the two groups of HCCs.
----------------------------------------------------------------------------------------*/
%macro nh(hcc,cat,n);
   %h(&hcc.,&cat.,&n.)
%mend nh;

/*----------------------------------------------------------------------------------------
   BOOTSTRAP: runs Steps 2 to 5 once per replicate.

   Parameters (both optional, so a call with no arguments behaves as before)
      reps  number of replicates; default 50
      seed  integer base seed for the random draws; default 0.
            seed <= 0 : no seed is set and RAND seeds itself, as in the original code
            seed >  0 : replicate r calls STREAMINIT with seed + r - 1, so a run can be
                        reproduced while replicates still use different streams

   Per replicate r the macro writes
      profile_change_dist_<r>.pdf, post_pert_coeffs_<r>.pdf, comparison_<r>.pdf
      and the permanent table DATOUT.RS_COMPARE_<r>.

   Requires the WORK tables POPHCC, MASTERMA and PREDICTIONS1 created earlier in this
   program, and the DATOUT libref.
----------------------------------------------------------------------------------------*/
%macro bootstrap(reps=50, seed=0);

   %local r;   /* keep the loop index from overwriting a macro variable R in an outer scope */

   /* Pre risk scores for the MA enrollees come from PREDICTIONS1, which does not
      change between replicates, so the table is built and sorted once here. */

   data pre_risk_scores;
      set predictions1;
      if apy05commabad=.;
   run;

   proc sort data=pre_risk_scores;
      by ma_enrollee_number;
   run;

   %do r=1 %to &reps.;

      filename outfile1 "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration\June 2018 Analysis\Output\profile_change_dist_&r..pdf";
      filename outfile3 "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration\June 2018 Analysis\Output\post_pert_coeffs_&r..pdf";
      filename outfilec "\\ipr2012core\data\PROJECTS\HHS\CMS\RADV MEDI PART C COMPOSITE ERROR CALC\FFSA Calibration\June 2018 Analysis\Output\comparison_&r..pdf";

      /***********/
      /* Step 2: Randomly perturb the HCC data based on the supplied probabilities. */

      data pophcc_p;
         set pophcc;

         %if &seed. > 0 %then %do;
         /* Seed the random stream once, before the first draw of this step. */
         if _n_=1 then call streaminit(%eval(&seed. + &r. - 1));
         %end;

         profile_change=0;

         /* Hierarchical HCCs. */

         %h(15,L,3.290);
         %h(16,L,3.290);
         %h(18,L,2.265);
         %h(19,L,6.199);
         %h(7,N,7.002);
         %h(8,N,13.396);
         %h(9,N,11.029);
         %h(10,H,7.043);
         %h(25,N,3.735);
         %h(26,N,5.214);
         %h(27,N,3.194);
         %h(51,N,2.249);
         %h(52,N,3.568);
         %h(54,H,8.956);
         %h(55,H,6.109);
         %h(67,N,4.540);
         %h(68,N,4.058);
         %h(69,N,2.430);
         %h(157,N,3.555);
         %h(78,N,2.010);
         %h(79,L,5.005);
         %h(81,N,3.580);
         %h(82,N,2.921);
         %h(83,H,2.417);
         %h(95,N,4.671);
         %h(96,H,4.105);
         %h(100,N,3.176);
         %h(101,N,2.645);
         %h(148,N,2.650);
         %h(149,L,4.844);
         %h(104,N,4.046);
         %h(105,N,3.442);
         %h(5,H,2.715);
         %h(112,N,2.332);
         %h(111,N,3.159);
         %h(107,N,6.524);
         %h(108,L,4.980);
         %h(130,L,4.480);
         %h(131,N,4.727);
         %h(132,N,2.307);
         %h(161,N,3.066);
         %h(177,N,3.402);
         %h(154,N,2.340);
         %h(75,N,2.367);
         %h(155,H,3.128);
         %h(17,N,2.638);
         %h(77,N,3.202);

         /* Non-hierarchical HCCs. */

         %nh(1,N,11.446);
         %nh(2,N,4.059);
         %nh(21,N,2.476);
         %nh(31,N,3.486);
         %nh(32,L,3.744);
         %nh(33,N,3.654);
         %nh(37,N,4.684);
         %nh(38,L,4.922);
         %nh(44,H,6.106);
         %nh(45,H,3.792);
         %nh(70,N,3.279);
         %nh(71,H,2.646);
         %nh(72,N,7.276);
         %nh(73,N,5.240);
         %nh(74,L,4.709);
         %nh(80,N,6.073);
         %nh(92,H,7.000);
         %nh(119,N,2.639);
         %nh(150,N,2.885);
         %nh(158,H,7.536);
         %nh(164,N,2.611);
         %nh(174,N,10.143);
         %nh(176,L,2.747);

         /* Update disabled-disease interactions */

         if sum(originallydisabled_female,originallydisabled_male)=1 then do;
            d_hcc5=hcc5;
            d_hcc44=hcc44;
            d_hcc51=hcc51;
            d_hcc52=hcc52;
            d_hcc107=hcc107;
         end;

         /* Update disease-disease interactions. See page B-4 of Medicare Advantage
            Risk Adjustment Data Validation Calendar Year 2012, List of Hierarchical
            and Non-Hierarchical CMS-HCCs Final. */

         dm_chf1=sum(hcc15,hcc16,hcc17,hcc18,hcc19)*hcc80;
         i_cvd=0;
         if sum(hcc95,hcc96,hcc100,hcc101)>0 then i_cvd=1;
         dm_cvd_70hccs=sum(hcc15,hcc16,hcc17,hcc18,hcc19)*i_cvd;
         chf_copd=hcc80*hcc108;
         copd_cvd_cad_70hccs=hcc108*i_cvd*sum(hcc81,hcc82,hcc83);
         rf_chf1=hcc131*hcc80;
         rf_chf_dm=hcc131*hcc80*sum(hcc15,hcc16,hcc17,hcc18,hcc19);

         /* The three-way interaction replaces the two two-way interactions. */
         if rf_chf_dm=1 then do;
            dm_chf1=0;
            rf_chf1=0;
         end;

         /* create new enrollee profile: the HCC flags concatenated into one string */

         profile=cats(of hcc1 -- hcc177);

         drop r i_cvd;
      run;

      /* NOTE(review): CATEGORY is overwritten by every HCC the enrollee has, so each row
         ends with the category of the last such HCC in call order, while PROFILE_CHANGE
         is 1 when any HCC was switched off. The table below crosses those two row-level
         values; confirm this is the intended reading of the report. */

      ods pdf body=outfile1;

         proc freq data=pophcc_p;
            tables category*profile_change/missing;
            title2 "Distribution of Perturbments by Category";
         run;

      ods pdf close;

      /***********/
      /* Step 3

         Regress expenses on the adjusted FFS data and compute a risk score for each
         beneficiary (predicted expenses divided by average expenses).

      */

      proc means data=pophcc_p;
         var apy05commabad;
         title2 "Average Expenditures for Adjusted FFS Data";
      run;

      proc reg data=pophcc_p outest=estcoeffs2(drop=_type_ _model_ _rmse_ _depvar_ apy05commabad);
         model apy05commabad = f0_34 -- f95_gt m0_34 -- m95_gt hcc1 -- hcc177
                               d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107 dm_chf1 dm_cvd_70hccs
                               chf_copd copd_cvd_cad_70hccs rf_chf1 rf_chf_dm/noint;
                                              /* HCCs, demos, interactions */
         output out=predictions2 (keep=hicno apy05commabad predicted) p=predicted;
         title2 "Regression on Adjusted FFS Data";
      run;

      proc means data=predictions2 noprint;
         var apy05commabad predicted;
         output out=mean_predictions2(drop=_type_ _freq_) mean=mean_expense mean_predicted;
         title2 "Adjusted FFS Data, Average Expenses, and Predicted Expenses";
      run;

      data predictions2;
         set predictions2;
         if _n_=1 then set mean_predictions2(keep=mean_expense);
         risk_score=predicted/mean_expense;
      run;

      proc print data=predictions2 (obs=10);
         title2 "Adjusted FFS Data with ID, Original Expenses, Predictions, and Risk Scores";
      run;

      /*  Verify that Average Risk Score equals 1. */

      proc means data=predictions2;
         var risk_score;
         title2 "Average Risk Score";
         title3 "Verify that Average Risk Score = 1";
      run;

      data estcoeffs2;
         set estcoeffs2;
         if _n_=1 then set mean_predictions2;
         drop mean_expense;
      run;

      proc print data=estcoeffs2;
         title2 "Estimated Regression Coefficients for Adjusted FFS Data Regression";
      run;

      /***********/
      /* Step 4

      Calculate IPARS for the UFFS data. This is done by predicting for the UFFS data (POPHCC)
      using the model from Step 3. Implement this by running the same PROC REG as in step 3 but
      with the UFFS data appended and not included in the PROC REG fit (expenses set to missing
      for UFFS data records).

      In this step, we predict for the MA enrollees using the normalized adjusted risk factors from the regression on
      perturbed FFS HCCs. The predicted risk scores for MA enrollees are used in Step 5 below. Note
      the regression does not use the MA enrollee data in the fit, since apy05commabad is missing
      for all MA enrollees. */

      /* PROFILE is not used by any later step, so it is left out of the stacked table. */
      data step4;
         set pophcc_p(in=a drop=profile) pophcc(in=b) masterma(in=c);
         if b then apy05commabad=.;
         if a then dsname='AFFS';
         if b then dsname='UFFS';
         if c then dsname='MAEN';
      run;

      proc reg data=step4 outest=estcoeffs3(drop=_type_ _model_ _rmse_ _depvar_ apy05commabad);
         model apy05commabad = f0_34 -- f95_gt m0_34 -- m95_gt hcc1 -- hcc177
                               d_hcc5 d_hcc44 d_hcc51 d_hcc52 d_hcc107 dm_chf1 dm_cvd_70hccs
                               chf_copd copd_cvd_cad_70hccs rf_chf1 rf_chf_dm/noint;
                                              /* HCCs, demos, interactions */
         output out=predictions3 (keep=hicno ma_enrollee_number dsname apy05commabad predicted) p=predicted;
         title2 "Regression on Adjusted FFS (AFFS) Data, applied to UFFS Data";
      run;

      ods pdf body=outfile3;

         proc print data=estcoeffs3;
            title2 "Estimated Regression Coefficients for Adjusted FFS Data Regression";
            title3 "Post-Perturbment";
         run;

      ods pdf close;

      proc means data=predictions3;
         class dsname;
         var apy05commabad predicted;
         output out=mean_predictions3(drop=_type_ _freq_) mean=mean_expense mean_predicted;
         title2 "AFFS and UFFS and MA Enrollee Data, Average Expenses, and Predicted Expenses";
      run;

      /* Keep only the AFFS row: its mean expense is the IPARS denominator. */
      data mean_predictions3;
         set mean_predictions3;
         if dsname='AFFS';
      run;

      /* The one-row means table is read on the first iteration, before the subsetting
         IF, so its value is carried to every UFFS and MAEN row that is kept. */
      data predictions31;
         set predictions3;
         if _n_=1 then set mean_predictions3(keep=mean_expense);
         if dsname in ('UFFS','MAEN');
         ipars=predicted/mean_expense;
      run;

      proc means data=predictions31;
         var ipars;
         output out=avg_ipars(drop=_type_ _freq_) mean=avg_ipars;
         where dsname='UFFS';
         title2 "Verify that Average IPARS before Normalization is Greater than 1";
      run;

      /* Normalize by the UFFS average so that the UFFS mean of IPARS_NORMALIZED is 1. */
      data predictions31;
         set predictions31;
         if _n_=1 then set avg_ipars(keep=avg_ipars);
         ipars_normalized=ipars/avg_ipars;
      run;

      proc means data=predictions31;
         var ipars_normalized;
         where dsname='UFFS';
         title2 "Verify that Average IPARS after Normalization is 1.0";
      run;

      /***********/
      /* Step 5

         Compare the predicted pre- and post-risk scores.

      */

      /* Create dataset with post risk score predictions for MA enrollees */

      data post_risk_scores;
         set predictions31;
         if dsname='MAEN';
         drop ipars avg_ipars dsname;
      run;

      /* Compare the Pre and Post Risk Scores (PRE_RISK_SCORES was sorted before the loop) */

      proc sort data=post_risk_scores;
         by ma_enrollee_number;
      run;

      data rs_compare(rename=(risk_score=pre_risk_score ipars_normalized=post_risk_score));
         merge pre_risk_scores post_risk_scores;
         by ma_enrollee_number;
         drop hicno apy05commabad;
      run;

      proc print data=rs_compare(obs=100);
         title2 "Comparison of Pre and Post Risk Scores (100 Enrollees)";
      run;

      /* Calculate comparison statistics */

      data rs_compare;
         set rs_compare;
         difference=post_risk_score-pre_risk_score;
         rd=difference/pre_risk_score;
         ard=abs(rd);
      run;

      /* Permanent datasets, rs_compare_#, are saved as SAS datasets and PDF documents */
      ods pdf body=outfilec;

         proc means data=rs_compare;
            var difference rd ard;
            output out=datout.rs_compare_&r.;
            title2 "Comparison of Pre and Post Risk Scores";
         run;

      ods pdf close;

      /* Remove the per-replicate work tables. QUIT ends the procedure here instead of
         leaving it active until the next step boundary. */
      proc datasets nolist;
         delete pophcc_p estcoeffs2 predictions2 mean_predictions2 estcoeffs3 predictions3
                mean_predictions3 predictions31 avg_ipars post_risk_scores
                rs_compare;
      run;
      quit;

   %end; /* end replicate r do loop */

   /* Remove the table that was shared by all replicates. */
   proc datasets nolist;
      delete pre_risk_scores;
   run;
   quit;

%mend bootstrap;

/* Execute the bootstrap macro defined above, using its built-in settings. */
%bootstrap;

/* QUIT ends any run-group procedure that is still active once the macro has finished. */
quit;
